Clean up the Data
library(readr)
library(dplyr)
library(stringr)
# Read the raw comparison file; `df` keeps the untouched import.
df <- read.csv("raw_data/coffee_compare.csv")

# Keep only the analysis columns and collapse every establishment name into one
# of two brand codes: names containing "DUNKIN" become "DD", all remaining
# names become "Starbucks".
coffee <- df %>%
  select(
    DBA, reinspections, checks, violations,
    score, inspections, BORO, SCORE
  ) %>%
  mutate(DBA = ifelse(str_detect(DBA, "DUNKIN"), "DD", "Starbucks"))

# Number of rows per brand code.
coffee$DBA %>% table()
## .
## DD Starbucks
## 625 346
# Number of rows per borough.
coffee %>% pull(BORO) %>% table()
## .
## BRONX BROOKLYN MANHATTAN QUEENS STATEN ISLAND
## 103 183 412 230 43
Visualization
library(ggthemes)
library(ggplot2)
library(plotly)
# Count the rows for every brand/borough combination. ungroup() removes the
# grouping that summarize() would otherwise leave attached to `coffee_new`,
# so later verbs applied to it do not silently operate per brand.
coffee_new <- coffee %>%
  group_by(DBA, BORO) %>%
  summarize(Value = n()) %>%
  ungroup()

# Dodged bar chart: one bar per brand within each borough.
# NOTE(review): `Value` is a row count, not a sum of the `violations` column,
# yet the y-axis and title call it "Health Violations" -- confirm which
# quantity is intended before publishing this figure.
pc <- ggplot(coffee_new, aes(x = BORO, y = Value, fill = DBA)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5) +
  xlab("Neighborhood") +
  ylab("Health Violations") +
  labs(caption = "Data Source: DOHMH",
       fill = "Brands") +
  ggtitle("Health Violations of Coffee Brands by Neighborhoods") +
  theme_bw() +
  # Colours are keyed by brand name rather than by position, so the mapping
  # cannot flip if the level order of DBA changes.
  scale_fill_manual(values = c("DD" = "#f09a56",
                               "Starbucks" = "#87dc97")) +
  # theme() comes after theme_bw() so these tweaks are not overwritten.
  theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
        legend.text = element_text(size = 8),
        legend.title = element_text(size = 8))
pc

# Scatter of the `violations` column against borough, coloured by brand,
# then rendered as an interactive plotly widget.
pc1 <- ggplot(data = coffee,
              mapping = aes(x = BORO, y = violations, color = DBA)) +
  geom_point(alpha = 0.5) +
  scale_color_manual(
    values = c("DD" = "#f09a56", "Starbucks" = "#87dc97")
  ) +
  labs(
    x = "Neighborhood",
    y = "Health Violations",
    title = "Health Violations of Coffee Brands by Neighborhoods",
    caption = "DOHMH"
  ) +
  theme_bw() +
  # Applied after theme_bw() so the title/legend tweaks survive.
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8)
  )
ggplotly(pc1)
# Scatter of the `SCORE` column against borough, coloured by brand,
# then rendered as an interactive plotly widget.
pc2 <- ggplot(data = coffee,
              mapping = aes(x = BORO, y = SCORE, color = DBA)) +
  geom_point(alpha = 0.5) +
  scale_color_manual(
    values = c("DD" = "#f09a56", "Starbucks" = "#87dc97")
  ) +
  labs(
    x = "Neighborhood",
    y = "Total Score for a Particular Inspection",
    title = "Score of Coffee Brands by Neighborhoods",
    caption = "DOHMH"
  ) +
  theme_bw() +
  # Applied after theme_bw() so the title/legend tweaks survive.
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8)
  )
ggplotly(pc2)
Supervised Machine Learning
Question: Can we tell whether a given coffee store is a Starbucks or not?
This is a binary outcome.
library(caret)
# Encode the brand as a two-level factor. "Starbucks" is deliberately the first
# level: caret's confusionMatrix() reports the first level as the positive
# class. Anything that is not "DD" is treated as "Starbucks".
coffee$DBA <- factor(ifelse(coffee$DBA == "DD", "DD", "Starbucks"),
                     levels = c("Starbucks", "DD"))

# Reproducible 80/20 split, stratified on the outcome.
set.seed(12345)
in_train <- createDataPartition(y = coffee$DBA,
                                p = 0.8, list = FALSE)
training <- coffee[ in_train, ]
testing <- coffee[-in_train, ]

# Logistic regression of brand on inspection counts, score and borough.
logit <- glm(DBA ~ checks + violations + score + BORO,
             data = training, family = binomial(link = "logit"))

# For a factor response, binomial glm() treats the FIRST level ("Starbucks")
# as failure and every other level as success, so the fitted probability is
# P(DBA == "DD"), not P(Starbucks).
y_hat_logit <- predict(logit, newdata = testing, type = "response")

# A probability above 0.5 therefore means "DD". The earlier version labelled
# these cases "Starbucks", which inverted every prediction; any previously
# knitted confusion matrix for this model predates the fix and must be
# regenerated.
z_logit <- factor(ifelse(y_hat_logit > 0.5, "DD", "Starbucks"),
                  levels = levels(testing$DBA))
confusionMatrix(z_logit, reference = testing$DBA)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Starbucks DD
## Starbucks 34 101
## DD 35 24
##
## Accuracy : 0.299
## 95% CI : (0.2355, 0.3687)
## No Information Rate : 0.6443
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.2596
##
## Mcnemar's Test P-Value : 2.494e-08
##
## Sensitivity : 0.4928
## Specificity : 0.1920
## Pos Pred Value : 0.2519
## Neg Pred Value : 0.4068
## Prevalence : 0.3557
## Detection Rate : 0.1753
## Detection Prevalence : 0.6959
## Balanced Accuracy : 0.3424
##
## 'Positive' Class : Starbucks
##
# Linear discriminant analysis through caret, using the same predictors as the
# logistic model. The argument is `preProcess`; the earlier misspelling
# (`reProcess`) was silently passed through `...` and ignored, so the
# predictors were never centered or scaled.
LDA <- train(DBA ~ checks + violations + score + BORO,
             data = training, method = "lda",
             preProcess = c("center", "scale"))

# Class predictions on the held-out set, compared against the true brand.
z <- predict(LDA, newdata = testing)
confusionMatrix(z, testing$DBA)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Starbucks DD
## Starbucks 38 26
## DD 31 99
##
## Accuracy : 0.7062
## 95% CI : (0.6367, 0.7693)
## No Information Rate : 0.6443
## P-Value [Acc > NIR] : 0.04083
##
## Kappa : 0.3484
##
## Mcnemar's Test P-Value : 0.59624
##
## Sensitivity : 0.5507
## Specificity : 0.7920
## Pos Pred Value : 0.5938
## Neg Pred Value : 0.7615
## Prevalence : 0.3557
## Detection Rate : 0.1959
## Detection Prevalence : 0.3299
## Balanced Accuracy : 0.6714
##
## 'Positive' Class : Starbucks
##